import json
import sys
import time
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
from openai import OpenAI

# Configure LM Studio connection.
# The client is pointed at a server on the local machine (port 1234, "/v1"
# path). The api_key is a placeholder string; presumably the local server does
# not validate it -- confirm against your LM Studio setup.
client = OpenAI(
    base_url="http://127.0.0.1:1234/v1",
    api_key="not-needed"
)

# Define models to use, in the order they are run.
# Each entry is sent as the `model` argument of the chat-completion request
# and, with '/' and '-' replaced by '_', also names that model's output
# columns and results CSV.
# NOTE(review): presumably each identifier must match a model available on the
# local server -- verify before a long run.
MODELS = [
    "qwen/qwen3-vl-4b",
    "ibm/granite-3.2-8b",
    "google/gemma-3n-e4b",
    "meta-llama-3-8b-instruct"
]

def _extract_json_object(reply_text: str) -> Dict:
    """Return the first JSON object embedded anywhere in *reply_text*.

    Every '{' in the text is tried as a possible start of an object, so the
    object is found even when it is wrapped in markdown fences or surrounded
    by extra prose.

    Raises:
        ValueError: If no position in the text decodes to a JSON object.
    """
    decoder = json.JSONDecoder()
    start = reply_text.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # whatever follows it (closing fences, trailing commentary, ...).
            candidate, _ = decoder.raw_decode(reply_text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = reply_text.find('{', start + 1)
    raise ValueError("No JSON object found in model response")


def analyze_violence(comment_text: str, model_name: str) -> Dict:
    """
    Analyze a single comment for violence content using the specified model.

    Args:
        comment_text: The comment to classify; inserted verbatim in the prompt.
        model_name: Model identifier sent as the ``model`` request argument.

    Returns:
        A dict that always contains ``discusses_violence``,
        ``frequency_score``, ``brief_explanation`` and ``model_used``. Keys the
        model left out are set to ``None``; any extra keys the model returned
        are kept. If the request fails or no JSON object can be extracted, the
        first two keys are ``None`` and ``brief_explanation`` carries
        ``"Error: <message>"``.
    """
    # The prompt remains exactly as requested
    prompt = f"""Analyze the following social media comment for violence-related content.

COMMENT:
{comment_text}

Respond ONLY with valid JSON in this exact format (no additional text):
{{
    "discusses_violence": true/false,
    "frequency_score": 0-10,
    "brief_explanation": "one sentence explanation"
}}

Where:
- discusses_violence: Does the comment discuss violence or a violent episode? (true/false)
- frequency_score: How frequent is the reference to violence from 0 (not at all) to 10 (predominant)
- brief_explanation: Brief reason for your assessment

Respond with ONLY the JSON object, nothing else."""

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a research assistant analyzing social media comments. Always respond with valid JSON only."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=200
        )

        reply_text = response.choices[0].message.content
        # content can be None or blank; fail with a readable message instead
        # of an AttributeError from calling a str method on None.
        if not reply_text or not reply_text.strip():
            raise ValueError("Model returned an empty response")

        result = _extract_json_object(reply_text)
    except Exception as e:
        # Deliberately broad: one failed request or unparsable reply must not
        # abort the whole labeling run, so the failure is recorded per row.
        return {
            "discusses_violence": None,
            "frequency_score": None,
            "brief_explanation": f"Error: {str(e)}",
            "model_used": model_name
        }

    # Guarantee the expected keys so callers can index them unconditionally.
    for expected_key in ("discusses_violence", "frequency_score", "brief_explanation"):
        result.setdefault(expected_key, None)
    result['model_used'] = model_name
    return result

def process_csv_with_model(input_file: str, model_name: str, output_dir: str) -> Optional[pd.DataFrame]:
    """
    Process the specific BERT positives CSV with a chosen model.

    Every row's 'text_original' value is sent to ``analyze_violence``; rows
    whose text is missing or blank are recorded as non-violent without a model
    call. Three columns suffixed with a sanitized model name are added to the
    frame, which is written to ``<output_dir>/results_<model>.csv``.

    Args:
        input_file: Path of the CSV to read; must have a 'text_original' column.
        model_name: Model identifier forwarded to ``analyze_violence``.
        output_dir: Directory for the results CSV; created if missing.

    Returns:
        The input frame with the added columns, or ``None`` when the CSV cannot
        be loaded or lacks the 'text_original' column.
    """
    # parents=True so a nested output directory does not fail on first use.
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    model_safe_name = model_name.replace('/', '_').replace('-', '_')

    # Load DataFrame
    try:
        # Note: Added 'on_bad_lines' to handle potential multiline text issues in social media comments
        df = pd.read_csv(input_file, on_bad_lines='skip')
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return None
    total_rows = len(df)
    print(f"\nProcessing {total_rows} rows with {model_name}...")

    # Target the 'text_original' column as requested
    target_col = 'text_original'
    if target_col not in df.columns:
        print(f"Error: Column '{target_col}' not found in {input_file}")
        return None

    # Results storage, one entry per row in row order
    results = []

    # The progress counter comes from enumerate rather than the index label,
    # which is not guaranteed to be an integer position.
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        text = row[target_col]
        comment_id = row.get('comment_id', idx)  # Use ID or index as fallback

        if pd.isna(text) or str(text).strip() == '':
            results.append({"discusses_violence": False, "frequency_score": 0, "brief_explanation": "Empty text"})
            continue

        print(f"  [{position}/{total_rows}] Analyzing ID: {comment_id}...", end='\r')
        results.append(analyze_violence(str(text), model_name))
        time.sleep(0.1)

    # Map results back to dataframe with model-specific suffixes.
    # Naming the columns keeps them present even when there are no rows (or a
    # result lacks a key); to_numpy() assigns by position instead of relying
    # on the two frames sharing identical index labels.
    res_df = pd.DataFrame(
        results,
        columns=['discusses_violence', 'frequency_score', 'brief_explanation'],
    )
    df[f'violence_{model_safe_name}'] = res_df['discusses_violence'].to_numpy()
    df[f'score_{model_safe_name}'] = res_df['frequency_score'].to_numpy()
    df[f'expl_{model_safe_name}'] = res_df['brief_explanation'].to_numpy()

    output_file = Path(output_dir) / f"results_{model_safe_name}.csv"
    df.to_csv(output_file, index=False)
    return df

def main():
    """Run every model in MODELS over the CSV given on the command line.

    Expects the input CSV path as the first command-line argument; prints a
    usage line and exits with status 1 when it is missing. Each model's
    results are written by ``process_csv_with_model`` into the
    'labeling_results' directory. Models that produced no results are listed,
    and the exit status is 1 when no model succeeded at all.
    """
    if len(sys.argv) < 2:
        print("Usage: python script.py bert_only_positives_sample.csv")
        sys.exit(1)

    input_file = sys.argv[1]
    output_dir = "labeling_results"

    # process_csv_with_model returns None on failure (unreadable CSV or
    # missing column); remember which models that happened for.
    failed_models = []
    for model in MODELS:
        if process_csv_with_model(input_file, model, output_dir) is None:
            failed_models.append(model)

    if failed_models:
        print(f"\n\nNo results were produced for: {', '.join(failed_models)}")
        if len(failed_models) == len(MODELS):
            # Nothing was written, so do not claim the run completed.
            sys.exit(1)

    print(f"\n\nProcessing complete. Check the '{output_dir}' folder.")

if __name__ == "__main__":
    main()